Setup the Seurat Object

library(dplyr)
library(Seurat)
library(patchwork)

# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "/home/lambi/Desktop/studia/ModZlo/lab7/fresh68/hg19/")
# Initialize the Seurat object with the raw (non-normalized data).
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc68k", min.cells = 3, min.features = 200)
pbmc
## An object of class Seurat 
## 17788 features across 68551 samples within 1 assay 
## Active assay: RNA (17788 features, 0 variable features)
##  1 layer present: counts
# shrink the dataset - ram problems
set.seed(42)
cells_20k <- sample(colnames(pbmc), size = 20000)
pbmc <- subset(pbmc, cells = cells_20k)

QC and selecting cells for further analysis

pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

# percentile cut-off
max_features <- quantile(pbmc$nFeature_RNA, 0.99)
max_counts <- quantile(pbmc$nCount_RNA, 0.99)

pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < max_features & nCount_RNA < max_counts &
    percent.mt < 5)

Normalizing the data

pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)

Identification of highly variable features (feature selection)

pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2

Scaling the data

pbmc <- ScaleData(pbmc)

Perform linear dimensional reduction

pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc), npcs = 30)
# Examine and visualize PCA results a few different ways
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  CST3, SERPINA1, LST1, RP11-290F20.3, SPI1 
## Negative:  RPS6, RPL13, B2M, RPS2, LTB 
## PC_ 2 
## Positive:  NKG7, GNLY, CST7, GZMB, GZMA 
## Negative:  LTB, RPL13, RPS2, CD79A, RPS6 
## PC_ 3 
## Positive:  RPS2, RPS6, RPL13, RPL34, AIF1 
## Negative:  CD79A, PDLIM1, TCL1A, MS4A1, GNG11 
## PC_ 4 
## Positive:  CD79A, TCL1A, HLA-DPB1, NKG7, CD79B 
## Negative:  PPBP, PF4, SDPR, NRGN, TUBB1 
## PC_ 5 
## Positive:  B2M, MS4A7, VMO1, RP11-290F20.3, CDKN1C 
## Negative:  S100A8, S100A9, LGALS2, CD14, CST3
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")

DimPlot(pbmc, reduction = "pca")

DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)

DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)

Determine the ‘dimensionality’ of the dataset

pbmc <- JackStraw(pbmc, num.replicate = 50)
pbmc <- ScoreJackStraw(pbmc, dims = 1:15)  #thresolding
JackStrawPlot(pbmc, dims = 1:15)

ElbowPlot(pbmc)

Cluster the cells

pca.embeddings <- Embeddings(pbmc, "pca")[, 1:15]
kmeans.result <- kmeans(pca.embeddings, centers = 10)
pbmc$kmeans_clusters <- as.factor(kmeans.result$cluster)

Run non-linear dimensional reduction (UMAP/tSNE)

pbmc <- RunUMAP(pbmc, dims = 1:15)
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(pbmc, reduction = "umap")

Homework Problem 2

subpopulation_labels <- rep(NA, ncol(pbmc))
names(subpopulation_labels) <- colnames(pbmc)

for (cluster_id in unique(pbmc$kmeans_clusters)) {
    cluster_cells <- WhichCells(pbmc, expression = kmeans_clusters == cluster_id)
    pca_data <- Embeddings(pbmc, "pca")[cluster_cells, 1:10]

    sil_widths <- sapply(2:6, function(k) {
        km <- kmeans(pca_data, centers = k, nstart = 10)
        sil <- cluster::silhouette(km$cluster, dist(pca_data))
        mean(sil[, 3])
    })

    optimal_k <- which.max(sil_widths) + 1
    km <- kmeans(pca_data, centers = optimal_k, nstart = 20)

    cluster_labels <- paste0("Cluster", cluster_id, "_Sub", km$cluster)
    names(cluster_labels) <- cluster_cells
    subpopulation_labels[cluster_cells] <- cluster_labels
}

pbmc$subpopulation <- subpopulation_labels
library(ggplot2)
tsne_coords <- as.data.frame(Embeddings(pbmc, reduction = "umap"))
colnames(tsne_coords) <- c("UMAP_1", "UMAP_2")

tsne_coords$cluster <- pbmc$kmeans_clusters
tsne_coords$subpopulation <- pbmc$subpopulation

centers <- tsne_coords %>%
    dplyr::group_by(cluster) %>%
    dplyr::summarise(x = median(UMAP_1), y = median(UMAP_2))

ggplot(tsne_coords, aes(x = UMAP_1, y = UMAP_2, color = subpopulation)) + geom_point(size = 1.5,
    alpha = 0.8) + geom_text(data = centers, aes(x = x, y = y, label = cluster), color = "black",
    size = 5, fontface = "bold") + theme_classic() + labs(title = "t-SNE with subpopulations", x = "UMAP 1",
    y = "UMAP 2")

DimPlot(pbmc, reduction = "umap", split.by = "kmeans_clusters", group.by = "subpopulation", ncol = 3)